Exploratory Analysis

Import data and process data

In [1]:
# Import the packages 
import IPython
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
from statistics import mode
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline

Data Preview

In [2]:
# Read the dataset into the dataframe 
data = pd.read_csv("diabetic_data.csv")
In [3]:
# Check the size of the dataset 
data.shape
Out[3]:
(101766, 50)
In [4]:
# Check the overall information:data typle and null values
data.info()
"""The columns do not contain null values, so recheck the object data type"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            101766 non-null int64
diag_1                      101766 non-null object
diag_2                      101766 non-null object
diag_3                      101766 non-null object
number_diagnoses            101766 non-null int64
max_glu_serum               101766 non-null object
A1Cresult                   101766 non-null object
metformin                   101766 non-null object
repaglinide                 101766 non-null object
nateglinide                 101766 non-null object
chlorpropamide              101766 non-null object
glimepiride                 101766 non-null object
acetohexamide               101766 non-null object
glipizide                   101766 non-null object
glyburide                   101766 non-null object
tolbutamide                 101766 non-null object
pioglitazone                101766 non-null object
rosiglitazone               101766 non-null object
acarbose                    101766 non-null object
miglitol                    101766 non-null object
troglitazone                101766 non-null object
tolazamide                  101766 non-null object
examide                     101766 non-null object
citoglipton                 101766 non-null object
insulin                     101766 non-null object
glyburide-metformin         101766 non-null object
glipizide-metformin         101766 non-null object
glimepiride-pioglitazone    101766 non-null object
metformin-rosiglitazone     101766 non-null object
metformin-pioglitazone      101766 non-null object
change                      101766 non-null object
diabetesMed                 101766 non-null object
readmitted                  101766 non-null object
dtypes: int64(13), object(37)
memory usage: 38.8+ MB
Out[4]:
'The columns do not contain null values, so recheck the object data type'
In [5]:
# recheck the object data type 
for i in data.columns:
    if data[i].dtype==object:
        d= data[i].value_counts()
        print(pd.DataFrame(data=d))
"""There are no null values in the dataset"""
                  race
Caucasian        76099
AfricanAmerican  19210
?                 2273
Hispanic          2037
Other             1506
Asian              641
                 gender
Female            54708
Male              47055
Unknown/Invalid       3
            age
[70-80)   26068
[60-70)   22483
[50-60)   17256
[80-90)   17197
[40-50)    9685
[30-40)    3775
[90-100)   2793
[20-30)    1657
[10-20)     691
[0-10)      161
           weight
?           98569
[75-100)     1336
[50-75)       897
[100-125)     625
[125-150)     145
[25-50)        97
[0-25)         48
[150-175)      35
[175-200)      11
>200            3
    payer_code
?        40256
MC       32439
HM        6274
SP        5007
BC        4655
MD        3532
CP        2533
UN        2448
CM        1937
OG        1033
PO         592
DM         549
CH         146
WC         135
OT          95
MP          79
SI          55
FR           1
                                   medical_specialty
?                                              49949
InternalMedicine                               14635
Emergency/Trauma                                7565
Family/GeneralPractice                          7440
Cardiology                                      5352
Surgery-General                                 3099
Nephrology                                      1613
Orthopedics                                     1400
Orthopedics-Reconstructive                      1233
Radiologist                                     1140
Pulmonology                                      871
Psychiatry                                       854
Urology                                          685
ObstetricsandGynecology                          671
Surgery-Cardiovascular/Thoracic                  652
Gastroenterology                                 564
Surgery-Vascular                                 533
Surgery-Neuro                                    468
PhysicalMedicineandRehabilitation                391
Oncology                                         348
Pediatrics                                       254
Hematology/Oncology                              207
Neurology                                        203
Pediatrics-Endocrinology                         159
Otolaryngology                                   125
Endocrinology                                    120
Surgery-Thoracic                                 109
Psychology                                       101
Podiatry                                         100
Surgery-Cardiovascular                            98
...                                              ...
Anesthesiology-Pediatric                          19
Obstetrics                                        19
Rheumatology                                      17
Pathology                                         17
OutreachServices                                  12
Anesthesiology                                    12
PhysicianNotFound                                 11
Surgery-Colon&Rectal                              11
Surgery-Maxillofacial                             11
Pediatrics-Neurology                              10
Surgery-Pediatric                                  8
Endocrinology-Metabolism                           8
Psychiatry-Child/Adolescent                        7
Cardiology-Pediatric                               7
AllergyandImmunology                               7
DCPTEAM                                            6
Dentistry                                          4
Pediatrics-Hematology-Oncology                     4
Pediatrics-AllergyandImmunology                    3
Pediatrics-EmergencyMedicine                       3
Resident                                           2
Surgery-PlasticwithinHeadandNeck                   1
Speech                                             1
SportsMedicine                                     1
Perinatology                                       1
Psychiatry-Addictive                               1
Neurophysiology                                    1
Dermatology                                        1
Pediatrics-InfectiousDiseases                      1
Proctology                                         1

[73 rows x 1 columns]
        diag_1
428       6862
414       6581
786       4016
410       3614
486       3508
427       2766
491       2275
715       2151
682       2042
434       2028
780       2019
996       1967
276       1889
38        1688
250.8     1680
599       1595
584       1520
V57       1207
250.6     1183
518       1115
820       1082
577       1057
493       1056
435       1016
562        989
574        965
296        896
560        876
250.7      871
250.13     851
...        ...
V07          1
837          1
704          1
915          1
817          1
365          1
314          1
637          1
838          1
870          1
250.51       1
347          1
148          1
691          1
523          1
605          1
V43          1
84           1
974          1
373          1
471          1
640          1
V51          1
839          1
690          1
911          1
988          1
700          1
299          1
363          1

[717 rows x 1 columns]
        diag_2
276       6752
428       6662
250       6071
427       5036
401       3736
496       3305
599       3288
403       2823
414       2650
411       2566
250.02    2074
707       1999
585       1871
584       1649
491       1545
250.01    1523
285       1520
780       1491
425       1434
682       1433
486       1379
518       1355
424       1071
413       1042
250.6      895
493        881
305        702
786        644
280        606
998        571
...        ...
140          1
E829         1
833          1
111          1
E980         1
52           1
7            1
E919         1
683          1
670          1
96           1
364          1
977          1
V60          1
250.31       1
523          1
843          1
529          1
880          1
615          1
115          1
E854         1
987          1
460          1
130          1
E850         1
E918         1
E929         1
948          1
V69          1

[749 rows x 1 columns]
        diag_3
250      11555
401       8289
276       5175
428       4577
427       3955
414       3664
496       2605
403       2357
585       1992
272       1969
599       1941
?         1423
V45       1389
250.02    1369
707       1360
780       1334
285       1200
425       1136
250.6     1080
424       1063
584        963
305        924
250.01     915
682        887
518        854
41         727
493        694
278        680
530        625
786        584
...        ...
E883         1
657          1
365.44       1
E886         1
308          1
838          1
E922         1
669          1
755          1
987          1
385          1
890          1
265          1
684          1
47           1
879          1
944          1
66           1
877          1
884          1
E876         1
872          1
876          1
970          1
146          1
370          1
930          1
880          1
175          1
E955         1

[790 rows x 1 columns]
      max_glu_serum
None          96420
Norm           2597
>200           1485
>300           1264
      A1Cresult
None      84748
>8         8216
Norm       4990
>7         3812
        metformin
No          81778
Steady      18346
Up           1067
Down          575
        repaglinide
No           100227
Steady         1384
Up              110
Down             45
        nateglinide
No           101063
Steady          668
Up               24
Down             11
        chlorpropamide
No              101680
Steady              79
Up                   6
Down                 1
        glimepiride
No            96575
Steady         4670
Up              327
Down            194
        acetohexamide
No             101765
Steady              1
        glipizide
No          89080
Steady      11356
Up            770
Down          560
        glyburide
No          91116
Steady       9274
Up            812
Down          564
        tolbutamide
No           101743
Steady           23
        pioglitazone
No             94438
Steady          6976
Up               234
Down             118
        rosiglitazone
No              95401
Steady           6100
Up                178
Down               87
        acarbose
No        101458
Steady       295
Up            10
Down           3
        miglitol
No        101728
Steady        31
Down           5
Up             2
        troglitazone
No            101763
Steady             3
        tolazamide
No          101727
Steady          38
Up               1
    examide
No   101766
    citoglipton
No       101766
        insulin
No        47383
Steady    30849
Down      12218
Up        11316
        glyburide-metformin
No                   101060
Steady                  692
Up                        8
Down                      6
        glipizide-metformin
No                   101753
Steady                   13
        glimepiride-pioglitazone
No                        101765
Steady                         1
        metformin-rosiglitazone
No                       101764
Steady                        2
        metformin-pioglitazone
No                      101765
Steady                       1
    change
No   54755
Ch   47011
     diabetesMed
Yes        78363
No         23403
     readmitted
NO        54864
>30       35545
<30       11357
Out[5]:
'There are no null values in the dataset'
In [6]:
# Check the numerical data values 
data.describe().transpose()
Out[6]:
count mean std min 25% 50% 75% max
encounter_id 101766.0 1.652016e+08 1.026403e+08 12522.0 84961194.0 152388987.0 2.302709e+08 443867222.0
patient_nbr 101766.0 5.433040e+07 3.869636e+07 135.0 23413221.0 45505143.0 8.754595e+07 189502619.0
admission_type_id 101766.0 2.024006e+00 1.445403e+00 1.0 1.0 1.0 3.000000e+00 8.0
discharge_disposition_id 101766.0 3.715642e+00 5.280166e+00 1.0 1.0 1.0 4.000000e+00 28.0
admission_source_id 101766.0 5.754437e+00 4.064081e+00 1.0 1.0 7.0 7.000000e+00 25.0
time_in_hospital 101766.0 4.395987e+00 2.985108e+00 1.0 2.0 4.0 6.000000e+00 14.0
num_lab_procedures 101766.0 4.309564e+01 1.967436e+01 1.0 31.0 44.0 5.700000e+01 132.0
num_procedures 101766.0 1.339730e+00 1.705807e+00 0.0 0.0 1.0 2.000000e+00 6.0
num_medications 101766.0 1.602184e+01 8.127566e+00 1.0 10.0 15.0 2.000000e+01 81.0
number_outpatient 101766.0 3.693572e-01 1.267265e+00 0.0 0.0 0.0 0.000000e+00 42.0
number_emergency 101766.0 1.978362e-01 9.304723e-01 0.0 0.0 0.0 0.000000e+00 76.0
number_inpatient 101766.0 6.355659e-01 1.262863e+00 0.0 0.0 0.0 1.000000e+00 21.0
number_diagnoses 101766.0 7.422607e+00 1.933600e+00 1.0 6.0 8.0 9.000000e+00 16.0

"""description: 1) encounter_id and patient_nbr need to be cleaned; 2) the maximum days for stay in hospital is 14 days 3) average num_lab_procedures=43 4) average num_medications=16 5) average num_diagnoses=7.4 """

Attribute Plotting

Readmission Rate Count

In [7]:
target_count=data['readmitted'].value_counts()
target_count.plot(kind='bar', title='Readmission_count')
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x10be2d048>

Age and Gender Count

In [8]:
age_count=data["age"].value_counts()
gender_count=data["gender"].value_counts()# Have to deal with unkown/invalid count 
fig,(ax1, ax2)= plt.subplots(1,2,figsize=(16,5))
_ = age_count.plot(kind='bar',title='Age Distribtion',ax=ax1)
_ = gender_count.plot(kind='bar',title='Gender Distribtion',ax=ax2)

Race and Weight Count

In [9]:
race_count=data["race"].value_counts()
weight_count=data["weight"].value_counts()# Have to deal with unkown/invalid count 
fig,(ax1, ax2)= plt.subplots(1,2,figsize=(16,5))
_ = race_count.plot(kind='bar',title='Race Distribtion',ax=ax1)# Missing values
_ = weight_count.plot(kind='bar',title='Weight Distribtion',ax=ax2)# Missing values 

Numerical data Distribution

In [10]:
num_col = list(set(list(data._get_numeric_data().columns))- {'readmitted'})
sns.set()
sns.pairplot(data[num_col], size = 2.5)
plt.show();

Data Processing

In [11]:
# Make a copy of the file for pre-processing 
train = data.copy(deep=True)

Processing object-type missing values

In [12]:
df = []
# Checkthe percentage of the missing values 
for col in train.columns:
    if train[col].dtype == object:
        count_missing = train[col][train[col] == '?'].count()
        percent_missing = (count_missing.sum()/train.shape[0]*100).round(2)
#         print(col,count_missing,percent_missing)
        df.append([col,count_missing,percent_missing])
missing_value=pd.DataFrame(df,columns=["col","count_missing","percent_missing"]).sort_values(by="percent_missing",ascending=False)
missing_value
Out[12]:
col count_missing percent_missing
3 weight 98569 96.86
5 medical_specialty 49949 49.08
4 payer_code 40256 39.56
0 race 2273 2.23
8 diag_3 1423 1.40
7 diag_2 358 0.35
6 diag_1 21 0.02
29 glyburide-metformin 0 0.00
24 troglitazone 0 0.00
25 tolazamide 0 0.00
26 examide 0 0.00
27 citoglipton 0 0.00
28 insulin 0 0.00
33 metformin-pioglitazone 0 0.00
30 glipizide-metformin 0 0.00
31 glimepiride-pioglitazone 0 0.00
32 metformin-rosiglitazone 0 0.00
22 acarbose 0 0.00
34 change 0 0.00
35 diabetesMed 0 0.00
23 miglitol 0 0.00
18 glyburide 0 0.00
21 rosiglitazone 0 0.00
20 pioglitazone 0 0.00
19 tolbutamide 0 0.00
1 gender 0 0.00
17 glipizide 0 0.00
16 acetohexamide 0 0.00
15 glimepiride 0 0.00
14 chlorpropamide 0 0.00
13 nateglinide 0 0.00
12 repaglinide 0 0.00
11 metformin 0 0.00
10 A1Cresult 0 0.00
9 max_glu_serum 0 0.00
2 age 0 0.00
36 readmitted 0 0.00
In [13]:
"""High missing values: 1. weight 2. medical_specialty 3. payer_code 4. race /
     Other values to clean: 1. diagosis_1 2. diagosis_2 3. diagnosis_3"""
Out[13]:
'High missing values: 1. weight 2. medical_specialty 3. payer_code 4. race /\n     Other values to clean: 1. diagosis_1 2. diagosis_2 3. diagnosis_3'

Drop high missing values

In [14]:
# drop the irrelavant and high missing value variables
# drop weight because of its high missing value 97% 
# drop medical_specialty because of its high missing value 49.8%
train=train.drop(['weight','medical_specialty'],axis=1)
# drop payer_code because it is irrelevant to our studies 
train=train.drop(['payer_code'],axis=1)
# Drop only the missing values in all three diagonosis categories 
train = train.drop(set(train[(train['diag_1']== '?') & (train['diag_2'] == '?') & (train['diag_3'] == '?')].index))
# Drop the patients who is expired after discharge which is out of our interest population
train = train.drop(set(train[train['discharge_disposition_id']==11].index))
In [15]:
train.shape
Out[15]:
(100123, 47)

Drop missing values in 'Gender' and 'Race'

In [16]:
# Process the missing values in gender
print('gender', train['gender'][train['gender'] == 'Unknown/Invalid'].count()) #only 3 missing values 
train = train.drop(set(train['gender'][train['gender'] == 'Unknown/Invalid'].index))
# Process the missing values in race
train= train.drop(set(train['race'][train['race']=='?'].index)) # drop the 2273 missing values 
gender 3
In [17]:
train.shape
Out[17]:
(97883, 47)

Drop irrelevant variables

In [18]:
# Drop irrelevant variables and high percent missing varaibles 
train=train.drop(["encounter_id","patient_nbr"],axis=1)
In [19]:
# Drop two attributes citoglipton and examide that have the all same value
train=train.drop(["citoglipton","examide"],axis = 1)

Creating/recoding new features

In [20]:
train.shape
Out[20]:
(97883, 43)
In [21]:
"""Recoding Plan: In this section, we recode the following variables:
   1. new variable patient_service: This new feature includes the patients' use of hospital service
   2. new variable med_change: This new feature calculate the change of medication dose of a patient
   3. new variable num_med: This new feature calculate the number of medication usage of each patient """
Out[21]:
"Recoding Plan: In this section, we recode the following variables:\n   1. new variable patient_service: This new feature includes the patients' use of hospital service\n   2. new variable med_change: This new feature calculate the change of medication dose of a patient\n   3. new variable num_med: This new feature calculate the number of medication usage of each patient "

Create new feature: patient_service

In [22]:
# Create a new feature service_utilization
train['patient_service'] = train['number_outpatient'] + train['number_emergency'] + train['number_inpatient']

Create new feature: med_change

In [23]:
# Create a new variable to calculate the change of medication dose
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 
        'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 
        'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
In [24]:
for col in keys:
    col_name = str(col) + 'new' #change the column name 
    train[col_name] = train[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)# recode the med use into binary variable
train['med_change'] = 0
for col in keys:
    col_name = str(col) + 'new'
    train['med_change'] = train['med_change'] + train[col_name]
    del train[col_name]
In [25]:
# Check the status of the new feature med_change
train['med_change'].value_counts()
Out[25]:
0    71177
1    25307
2     1288
3      106
4        5
Name: med_change, dtype: int64

Create new feature: num_med

In [26]:
# calculate the number of medications used for each patient 
for col in keys:
    train[col] = train[col].replace('No', 0)
    train[col] = train[col].replace('Steady', 1)
    train[col] = train[col].replace('Up', 1)
    train[col] = train[col].replace('Down', 1) 
train['num_med'] = 0
for col in keys:
    train['num_med'] = train['num_med'] + train[col]
In [27]:
train['num_med'].value_counts()
Out[27]:
1    45365
0    22454
2    21165
3     7537
4     1300
5       57
6        5
Name: num_med, dtype: int64

Recode diagnosis into new categories

In [28]:
"""diag_1=primary diagnosis; diag_2=secondary diagnosis; diag_3=additional diagnosis\
i"""
# duplicate a diagnosis column
train['primary_diag'] = train['diag_1']
train['secondary_diag'] = train['diag_2']
train['additional_diag'] = train['diag_3']
In [29]:
# replace the unknown values "?" with -1
train['primary_diag'] = train['primary_diag'].replace('?', -1)
train['secondary_diag'] = train['secondary_diag'].replace('?', -1)
train['additional_diag'] = train['additional_diag'].replace('?', -1)
In [30]:
# recode ICE code of V or E to “other” category = 0
train.loc[train['diag_1'].str.contains('V'), ['primary_diag']] = 0
train.loc[train['diag_1'].str.contains('E'), ['primary_diag']] = 0
train.loc[train['diag_2'].str.contains('V'), ['secondary_diag']] = 0
train.loc[train['diag_2'].str.contains('E'), ['secondary_diag']] = 0
train.loc[train['diag_3'].str.contains('V'), ['additional_diag']] = 0
train.loc[train['diag_3'].str.contains('E'), ['additional_diag']] = 0
In [31]:
# convert the data type to float to enable computations later
train['primary_diag'] = train['primary_diag'].astype(float)
train['secondary_diag'] = train['secondary_diag'].astype(float)
train['additional_diag'] = train['additional_diag'].astype(float)
In [32]:
# recode ICD codes of diag1,diag2,diag3 between certain ranges to certain categories
for index, row in train.iterrows():
    if (row['primary_diag'] >= 390 and row['primary_diag'] < 460) or (np.floor(row['primary_diag']) == 785):
        train.loc[index, 'primary_diag'] = 1
    elif (row['primary_diag'] >= 460 and row['primary_diag'] < 520) or (np.floor(row['primary_diag']) == 786):
        train.loc[index, 'primary_diag'] = 2
    elif (row['primary_diag'] >= 520 and row['primary_diag'] < 580) or (np.floor(row['primary_diag']) == 787):
        train.loc[index, 'primary_diag'] = 3
    elif (np.floor(row['primary_diag']) == 250):
        train.loc[index, 'primary_diag'] = 4
    elif (row['primary_diag'] >= 800 and row['primary_diag'] < 1000):
        train.loc[index, 'primary_diag'] = 5
    elif (row['primary_diag'] >= 710 and row['primary_diag'] < 740):
        train.loc[index, 'primary_diag'] = 6
    elif (row['primary_diag'] >= 580 and row['primary_diag'] < 630) or (np.floor(row['primary_diag']) == 788):
        train.loc[index, 'primary_diag'] = 7
    elif (row['primary_diag'] >= 140 and row['primary_diag'] < 240):
        train.loc[index, 'primary_diag'] = 8
    else:
        train.loc[index, 'primary_diag'] = 0
In [33]:
for index, row in train.iterrows():
    if (row['secondary_diag'] >= 390 and row['secondary_diag'] < 460) or (np.floor(row['secondary_diag']) == 785):
        train.loc[index, 'secondary_diag'] = 1
    elif (row['secondary_diag'] >= 460 and row['secondary_diag'] < 520) or (np.floor(row['secondary_diag']) == 786):
        train.loc[index, 'secondary_diag'] = 2
    elif (row['secondary_diag'] >= 520 and row['secondary_diag'] < 580) or (np.floor(row['secondary_diag']) == 787):
        train.loc[index, 'secondary_diag'] = 3
    elif (np.floor(row['secondary_diag']) == 250):
        train.loc[index, 'secondary_diag'] = 4
    elif (row['secondary_diag'] >= 800 and row['secondary_diag'] < 1000):
        train.loc[index, 'secondary_diag'] = 5
    elif (row['secondary_diag'] >= 710 and row['secondary_diag'] < 740):
        train.loc[index, 'secondary_diag'] = 6
    elif (row['secondary_diag'] >= 580 and row['secondary_diag'] < 630) or (np.floor(row['secondary_diag']) == 788):
        train.loc[index, 'secondary_diag'] = 7
    elif (row['secondary_diag'] >= 140 and row['secondary_diag'] < 240):
        train.loc[index, 'secondary_diag'] = 8
    else:
        train.loc[index, 'secondary_diag'] = 0
In [34]:
for index, row in train.iterrows():
    if (row['additional_diag'] >= 390 and row['additional_diag'] < 460) or (np.floor(row['additional_diag']) == 785):
        train.loc[index, 'additional_diag'] = 1
    elif (row['additional_diag'] >= 460 and row['additional_diag'] < 520) or (np.floor(row['additional_diag']) == 786):
        train.loc[index, 'additional_diag'] = 2
    elif (row['additional_diag'] >= 520 and row['additional_diag'] < 580) or (np.floor(row['additional_diag']) == 787):
        train.loc[index, 'additional_diag'] = 3
    elif (np.floor(row['additional_diag']) == 250):
        train.loc[index, 'additional_diag'] = 4
    elif (row['additional_diag'] >= 800 and row['additional_diag'] < 1000):
        train.loc[index, 'additional_diag'] = 5
    elif (row['additional_diag'] >= 710 and row['additional_diag'] < 740):
        train.loc[index, 'additional_diag'] = 6
    elif (row['additional_diag'] >= 580 and row['additional_diag'] < 630) or (np.floor(row['additional_diag']) == 788):
        train.loc[index, 'additional_diag'] = 7
    elif (row['additional_diag'] >= 140 and row['additional_diag'] < 240):
        train.loc[index, 'additional_diag'] = 8
    else:
        train.loc[index, 'additional_diag'] = 0

Recode admission_type_id and discharge_disposition_id and admission_source_id

In [39]:
"""Use the similarity of the categories to recode and reduct the categories"""
Out[39]:
'Use the similarity of the categories to recode and reduct the categories'
In [35]:
train['admission_type_id'].value_counts()
Out[35]:
1    51818
3    18383
2    17542
6     5162
5     4634
8      317
7       17
4       10
Name: admission_type_id, dtype: int64
In [36]:
# recode admission type (1-Emergency 5-Referral)
train['admission_type_id'] = train['admission_type_id'].replace(2,1)
train['admission_type_id'] = train['admission_type_id'].replace(7,1)
train['admission_type_id'] = train['admission_type_id'].replace(6,5)
train['admission_type_id'] = train['admission_type_id'].replace(8,5)
In [37]:
train['discharge_disposition_id'].sort_values(ascending=False,inplace=False).value_counts()
Out[37]:
1     58767
3     13614
6     12698
18     3673
2      2061
22     1981
5      1143
25      973
4       765
7       611
23      406
13      396
14      369
28      138
8       107
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64
In [38]:
# recode discharge_disposition_id type (1-discharged to home 2-other 3-? 4-?)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(6,1)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(8,1)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(9,1)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(13,1)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(3,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(4,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(5,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(14,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(22,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(23,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(24,2)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(12,10)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(15,10)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(16,10)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(17,10)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(25,18)
train['discharge_disposition_id'] = train['discharge_disposition_id'].replace(26,18)
In [39]:
train['admission_source_id'].sort_values(ascending=False,inplace=False).value_counts()
Out[39]:
7     55527
1     28617
17     6565
4      2936
6      1969
2      1023
5       808
3       185
20      158
9        54
8        15
22       12
10        7
11        2
14        2
25        2
13        1
Name: admission_source_id, dtype: int64
In [40]:
# recode ad_mission_source_id type (1-emergency 4-home 9-transfer 11-other) 
train['admission_source_id'] = train['admission_source_id'].replace(2,1)
train['admission_source_id'] = train['admission_source_id'].replace(3,1)
train['admission_source_id'] = train['admission_source_id'].replace(5,4)
train['admission_source_id'] = train['admission_source_id'].replace(6,4)
train['admission_source_id'] = train['admission_source_id'].replace(10,4)
train['admission_source_id'] = train['admission_source_id'].replace(22,4)
train['admission_source_id'] = train['admission_source_id'].replace(25,4)
train['admission_source_id'] = train['admission_source_id'].replace(7,9)
train['admission_source_id'] = train['admission_source_id'].replace(17,9)
train['admission_source_id'] = train['admission_source_id'].replace(20,9)
train['admission_source_id'] = train['admission_source_id'].replace(21,9)
train['admission_source_id'] = train['admission_source_id'].replace(13,11)
train['admission_source_id'] = train['admission_source_id'].replace(14,11)

Create dummy Variables-change, gender and diabetesMed

In [41]:
# Recode change 
train['change'] = train['change'].replace('Ch', 1)
train['change'] = train['change'].replace('No', 0)
# Recode gender 
train['gender'] = train['gender'].replace('Male', 1)
train['gender'] = train['gender'].replace('Female', 0)
# Recode diabetesMed
train['diabetesMed'] = train['diabetesMed'].replace('Yes', 1)
train['diabetesMed'] = train['diabetesMed'].replace('No', 0)

Recode the outcome variable-Readmitted

In [42]:
"""Since our interest here is the readmission rate that is less than 30 days??"""
# calculate the outcome variable readmission
train['readmitted'] = train['readmitted'].replace('>30', 0)
train['readmitted'] = train['readmitted'].replace('<30', 1)
train['readmitted'] = train['readmitted'].replace('NO', 0)

Recode the ordinal variable-Age

In [43]:
train['age'].sort_values(ascending=False).value_counts()
Out[43]:
[70-80)     24978
[60-70)     21699
[50-60)     16743
[80-90)     16320
[40-50)      9408
[30-40)      3689
[90-100)     2601
[20-30)      1604
[10-20)       682
[0-10)        159
Name: age, dtype: int64
In [44]:
#recode age using mean of each order 
"""recode it by means is the trade-off of uisng the continuous 1-10. Using the mean can maintain the interpretability of the age"""
age_dict = {'[0-10)':5, '[10-20)':15, '[20-30)':25, '[30-40)':35, '[40-50)':45, '[50-60)':55, '[60-70)':65, '[70-80)':75, '[80-90)':85, '[90-100)':95}
train['age'] = train.age.map(age_dict)
train['age'] = train['age'].astype('int64')

Recode the categorical variables: Race, A1Cresult,max_glu_serum

In [45]:
train['race'].sort_values(ascending=False,inplace=False).value_counts()
Out[45]:
Caucasian          74854
AfricanAmerican    18888
Hispanic            2024
Other               1485
Asian                632
Name: race, dtype: int64
In [46]:
train['race'] = train['race'].replace('Caucasian', 1)
train['race'] = train['race'].replace('AfricanAmerican', 2)
train['race'] = train['race'].replace('Hispanic', 3)
train['race'] = train['race'].replace('Asian', 4) 
train['race'] = train['race'].replace('Other', 0) 
In [47]:
train['A1Cresult'].sort_values(ascending=False,inplace=False).value_counts()
Out[47]:
None    81427
>8       7897
Norm     4857
>7       3702
Name: A1Cresult, dtype: int64
In [48]:
# recode A1Cresult 
train['A1Cresult'] = train['A1Cresult'].replace('>7', 1)
train['A1Cresult'] = train['A1Cresult'].replace('>8', 1)
train['A1Cresult'] = train['A1Cresult'].replace('Norm', 0)
train['A1Cresult'] = train['A1Cresult'].replace('None', 99) 
In [49]:
train['max_glu_serum'].sort_values(ascending=False,inplace=False).value_counts()
Out[49]:
None    92712
Norm     2549
>200     1421
>300     1201
Name: max_glu_serum, dtype: int64
In [50]:
# recode max_glu_serum using the similar approach
train['max_glu_serum'] = train['max_glu_serum'].replace('>200', 1)
train['max_glu_serum'] = train['max_glu_serum'].replace('>300', 1)
train['max_glu_serum'] = train['max_glu_serum'].replace('Norm', 0)
train['max_glu_serum'] = train['max_glu_serum'].replace('None', 99)
In [51]:
train.to_csv('./modified_diabetes1205_beforeEDA.csv',index=None)

Feature Engineering

In [57]:
train=pd.read_csv('./modified_diabetes1205_beforeEDA.csv')
In [58]:
train.head(5).T
Out[58]:
0 1 2 3 4
race 1 1 2 1 1
gender 0 0 0 1 1
age 5 15 25 35 45
admission_type_id 5 1 1 1 1
discharge_disposition_id 18 1 1 1 1
admission_source_id 1 9 9 9 9
time_in_hospital 1 3 2 2 1
num_lab_procedures 41 59 11 44 51
num_procedures 0 0 5 1 0
num_medications 1 18 13 16 8
number_outpatient 0 0 2 0 0
number_emergency 0 0 0 0 0
number_inpatient 0 0 1 0 0
diag_1 250.83 276 648 8 197
diag_2 ? 250.01 250 250.43 157
diag_3 ? 255 V27 403 250
number_diagnoses 1 9 6 7 5
max_glu_serum 99 99 99 99 99
A1Cresult 99 99 99 99 99
metformin 0 0 0 0 0
repaglinide 0 0 0 0 0
nateglinide 0 0 0 0 0
chlorpropamide 0 0 0 0 0
glimepiride 0 0 0 0 0
acetohexamide 0 0 0 0 0
glipizide 0 0 1 0 1
glyburide 0 0 0 0 0
tolbutamide 0 0 0 0 0
pioglitazone 0 0 0 0 0
rosiglitazone 0 0 0 0 0
acarbose 0 0 0 0 0
miglitol 0 0 0 0 0
troglitazone 0 0 0 0 0
tolazamide 0 0 0 0 0
insulin 0 1 0 1 1
glyburide-metformin 0 0 0 0 0
glipizide-metformin 0 0 0 0 0
glimepiride-pioglitazone 0 0 0 0 0
metformin-rosiglitazone 0 0 0 0 0
metformin-pioglitazone 0 0 0 0 0
change 0 1 0 1 1
diabetesMed 0 1 1 1 1
readmitted 0 0 0 0 0
patient_service 0 0 3 0 0
med_change 0 1 0 1 0
num_med 0 1 1 1 2
primary_diag 4 0 0 0 8
secondary_diag 0 4 4 4 8
additional_diag 0 0 0 1 4

Data type conversion

In [52]:
"""This serves for the numerical data processing and transformation"""
train.dtypes
Out[52]:
race                          int64
gender                        int64
age                           int64
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses              int64
max_glu_serum                 int64
A1Cresult                     int64
metformin                     int64
repaglinide                   int64
nateglinide                   int64
chlorpropamide                int64
glimepiride                   int64
acetohexamide                 int64
glipizide                     int64
glyburide                     int64
tolbutamide                   int64
pioglitazone                  int64
rosiglitazone                 int64
acarbose                      int64
miglitol                      int64
troglitazone                  int64
tolazamide                    int64
insulin                       int64
glyburide-metformin           int64
glipizide-metformin           int64
glimepiride-pioglitazone      int64
metformin-rosiglitazone       int64
metformin-pioglitazone        int64
change                        int64
diabetesMed                   int64
readmitted                    int64
patient_service               int64
med_change                    int64
num_med                       int64
primary_diag                float64
secondary_diag              float64
additional_diag             float64
dtype: object
In [53]:
# convert data type of nominal features in dataframe to 'object' type
i = ['race','gender','age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',\
          'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', \
          'A1Cresult', 'max_glu_serum', 'primary_diag', 'secondary_diag', 'additional_diag']

train[i] = train[i].astype('object')
In [54]:
for i in train.columns:
    if train[i].dtype==object:
        d= train[i].value_counts()
        print(pd.DataFrame(data=d))
    race
1  74854
2  18888
3   2024
0   1485
4    632
   gender
0   52747
1   45136
      age
75  24978
65  21699
55  16743
85  16320
45   9408
35   3689
95   2601
25   1604
15    682
5     159
   admission_type_id
1              69377
3              18383
5              10113
4                 10
    discharge_disposition_id
1                      71989
2                      20387
18                      4646
7                        611
28                       138
10                        97
19                         8
27                         5
20                         2
    admission_source_id
9                 62304
1                 29825
4                  5734
8                    15
11                    5
        diag_1
428       6613
414       6381
786       3938
410       3387
486       3332
427       2676
491       2205
715       2095
780       1984
682       1984
996       1923
434       1921
276       1811
250.8     1629
599       1546
38        1484
584       1462
V57       1178
250.6     1147
820       1052
493       1039
577       1023
435        993
562        968
518        966
574        948
296        858
560        850
250.7      844
250.13     837
...        ...
363          1
314          1
838          1
870          1
637          1
691          1
523          1
347          1
605          1
148          1
373          1
543          1
974          1
871          1
542          1
V60          1
299          1
915          1
988          1
700          1
837          1
V07          1
839          1
649          1
911          1
61           1
98           1
10           1
704          1
160          1

[714 rows x 1 columns]
        diag_2
276       6539
428       6393
250       5867
427       4830
401       3631
496       3201
599       3196
403       2728
414       2570
411       2484
250.02    2028
707       1904
585       1802
584       1565
250.01    1496
491       1481
285       1477
780       1428
682       1393
425       1389
486       1274
518       1165
424       1036
413       1006
250.6      868
493        861
305        688
786        632
280        589
998        548
...        ...
E883         1
140          1
V50          1
35           1
250.31       1
E854         1
46           1
V69          1
182          1
302          1
879          1
615          1
523          1
E813         1
927          1
506          1
164          1
E918         1
E850         1
704          1
316          1
853          1
977          1
271          1
E817         1
880          1
963          1
974          1
347          1
163          1

[745 rows x 1 columns]
        diag_3
250      11169
401       8062
276       4947
428       4384
427       3752
414       3551
496       2486
403       2261
585       1913
272       1909
599       1864
V45       1368
?         1346
250.02    1326
707       1315
780       1297
285       1167
425       1097
250.6     1047
424       1014
305        899
250.01     885
584        871
682        852
518        748
41         708
278        672
493        671
530        606
786        572
...        ...
250.31       1
308          1
17           1
E883         1
111          1
657          1
365.44       1
838          1
57           1
669          1
E886         1
146          1
880          1
930          1
877          1
E900         1
732          1
684          1
944          1
879          1
66           1
374          1
377          1
884          1
814          1
E876         1
47           1
872          1
970          1
890          1

[785 rows x 1 columns]
    max_glu_serum
99          92712
1            2622
0            2549
    A1Cresult
99      81427
1       11599
0        4857
   metformin
0      78449
1      19434
   repaglinide
0        96366
1         1517
   nateglinide
0        97195
1          688
   chlorpropamide
0           97799
1              84
   glimepiride
0        92890
1         4993
   acetohexamide
0          97882
1              1
   glipizide
0      85560
1      12323
   glyburide
0      87583
1      10300
   tolbutamide
0        97863
1           20
   pioglitazone
0         90754
1          7129
   rosiglitazone
0          91691
1           6192
   acarbose
0     97582
1       301
   miglitol
0     97845
1        38
   troglitazone
0         97880
1             3
   tolazamide
0       97844
1          39
   insulin
1    52072
0    45811
   glyburide-metformin
0                97211
1                  672
   glipizide-metformin
0                97870
1                   13
   glimepiride-pioglitazone
0                     97882
1                         1
   metformin-rosiglitazone
0                    97883
   metformin-pioglitazone
0                   97882
1                       1
   change
0   52525
1   45358
   diabetesMed
1        75429
0        22454
     primary_diag
1.0         29212
0.0         17468
2.0         13766
3.0          9203
4.0          8504
5.0          6732
7.0          4963
6.0          4813
8.0          3222
     secondary_diag
1.0           30683
0.0           25968
4.0           12411
2.0           10285
7.0            8089
3.0            4020
8.0            2404
5.0            2311
6.0            1712
     additional_diag
0.0            29555
1.0            29157
4.0            16593
2.0             6951
7.0             6372
3.0             3809
5.0             1854
6.0             1851
8.0             1741

Normality

In [55]:
# Get the list of the numerical variable  
num_col = list(set(list(train._get_numeric_data().columns))- {'readmitted'})
num_col
Out[55]:
['num_lab_procedures',
 'number_emergency',
 'number_outpatient',
 'num_procedures',
 'number_diagnoses',
 'time_in_hospital',
 'patient_service',
 'number_inpatient',
 'med_change',
 'num_med',
 'num_medications']
In [63]:
# Plot the pair scatter plot to check the distribution 
sns.set()
cols = ['num_med',
 'number_emergency',
 'num_lab_procedures',
 'patient_service',
 'time_in_hospital',
 'med_change',
 'num_procedures',
 'number_diagnoses',
 'number_outpatient',
 'num_medications',
 'number_inpatient']
sns.pairplot(train[cols], size = 2.5)
plt.show();
In [56]:
# Check the skewness and kurtosis of the variables 
from scipy import stats
i=['num_med',
 'number_emergency',
 'num_lab_procedures',
 'patient_service',
 'time_in_hospital',
 'med_change',
 'num_procedures',
 'number_diagnoses',
 'number_outpatient',
 'num_medications',
 'number_inpatient']
print(train[i].skew())
print(train[i].kurt())
num_med                0.678066
number_emergency      22.649042
num_lab_procedures    -0.239627
patient_service        5.313745
time_in_hospital       1.137276
med_change             1.423876
num_procedures         1.324363
number_diagnoses      -0.891600
number_outpatient      8.790942
num_medications        1.328952
number_inpatient       3.609237
dtype: float64
num_med                  0.282313
number_emergency      1163.612749
num_lab_procedures      -0.255417
patient_service         67.142638
time_in_hospital         0.867396
med_change               1.435763
num_procedures           0.891609
number_diagnoses        -0.073727
number_outpatient      147.256600
num_medications          3.513028
number_inpatient        20.612171
dtype: float64
In [57]:
"""From the scatter plot, we found most of numerical are highly skewed and had high kurtosis.
Using the threshold=+-1 as skewness for normal distributioon, 
If skewness is less than -1 or greater than 1, the distribution is highly skewed.
If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.
If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.
As the standard for kurtosis, the threhold=3 is for normal distribution. 
So the variables that need to be transformed are: 1. number_emergency 2. patient_service 3.time_in_hospital 4. med_change\
5. num_procedures 6.num_outpatient 7.num_medications 8. num_inpatient"""
Out[57]:
'From the scatter plot, we found most of numerical are highly skewed and had high kurtosis.\nUsing the threshold=+-1 as skewness for normal distributioon, \nIf skewness is less than -1 or greater than 1, the distribution is highly skewed.\nIf skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed.\nIf skewness is between -0.5 and 0.5, the distribution is approximately symmetric.\nAs the standard for kurtosis, the threhold=3 is for normal distribution. \nSo the variables that need to be transformed are: 1. number_emergency 2. patient_service 3.time_in_hospital 4. med_change5. num_procedures 6.num_outpatient 7.num_medications 8. num_inpatient'

Log Transformation

In [58]:
# log transforamtion for the skewed numerical variables 
key=['num_med',
 'number_emergency',
 'num_lab_procedures',
 'patient_service',
 'time_in_hospital',
 'med_change',
 'num_procedures',
 'number_diagnoses',
 'number_outpatient',
 'num_medications',
 'number_inpatient']
for col in key:
    if abs(train[col].skew())>=1:
        train[col+"_log"]=np.log1p(train[col])
        print([col+"_log"],train[col+"_log"].skew())
['number_emergency_log'] 3.6291030366340697
['patient_service_log'] 1.1064806833594183
['time_in_hospital_log'] 0.10341360716227363
['med_change_log'] 1.1350977887575229
['num_procedures_log'] 0.523122171785963
['number_outpatient_log'] 2.72014768343293
['num_medications_log'] -0.4851017786134751
['number_inpatient_log'] 1.4377657193647482

Standarlization

In [59]:
"""We found different scale of the numeriable variables are in different scale which will cause
different weight to the analysis so we transform the variables into comparable scales"""

def standardize(data):
    return ((data - np.mean(data, axis = 0)) / np.std(data, axis = 0))
# num_col is a list of all numeric features
train[num_col] = standardize(train[num_col])

Correlation

In [60]:
#Check the correlation between variables 
train.drop(train.columns[train.columns.str.contains('unnamed',case = False)],axis = 1,inplace=True)
train_col =train.corr()
train_col
Out[60]:
time_in_hospital num_lab_procedures num_procedures num_medications number_outpatient number_emergency number_inpatient number_diagnoses readmitted patient_service med_change num_med number_emergency_log patient_service_log time_in_hospital_log med_change_log num_procedures_log number_outpatient_log num_medications_log number_inpatient_log
time_in_hospital 1.000000 0.319027 0.191581 0.464353 -0.010085 -0.009788 0.073775 0.221438 0.045744 0.031012 0.160688 0.075012 -0.007591 0.046722 0.957352 0.158472 0.206295 -0.017585 0.442712 0.088125
num_lab_procedures 0.319027 1.000000 0.052141 0.264021 -0.008678 -0.001382 0.039655 0.148904 0.023541 0.016461 0.117204 0.029475 0.003486 0.016322 0.334975 0.116351 0.037231 -0.021961 0.256277 0.043842
num_procedures 0.191581 0.052141 1.000000 0.381255 -0.025302 -0.038860 -0.066977 0.068461 -0.009974 -0.066690 0.010142 0.004754 -0.052583 -0.085881 0.165775 0.008742 0.960722 -0.033984 0.334835 -0.076668
num_medications 0.464353 0.264021 0.381255 1.000000 0.045077 0.013742 0.065715 0.258193 0.041800 0.066621 0.229139 0.227674 0.024949 0.078923 0.464633 0.229697 0.384716 0.054372 0.934708 0.073835
number_outpatient -0.010085 -0.008678 -0.025302 0.045077 1.000000 0.091286 0.106778 0.092998 0.019053 0.647440 0.029435 0.000264 0.130970 0.541844 -0.005612 0.031846 -0.019934 0.883733 0.057071 0.117053
number_emergency -0.009788 -0.001382 -0.038860 0.013742 0.091286 1.000000 0.266818 0.054425 0.060911 0.606273 0.053808 0.007505 0.821051 0.416974 -0.009312 0.056512 -0.036799 0.119275 0.022113 0.228602
number_inpatient 0.073775 0.039655 -0.066977 0.065715 0.106778 0.266818 1.000000 0.103145 0.167983 0.718069 0.065295 -0.029519 0.300245 0.737181 0.076423 0.069854 -0.057980 0.140890 0.086582 0.932273
number_diagnoses 0.221438 0.148904 0.068461 0.258193 0.092998 0.054425 0.103145 1.000000 0.052365 0.130277 0.073193 0.013646 0.079935 0.164029 0.238933 0.074707 0.067573 0.109666 0.302911 0.128250
readmitted 0.045744 0.023541 -0.009974 0.041800 0.019053 0.060911 0.167983 0.052365 1.000000 0.127823 0.036036 0.001086 0.073448 0.136116 0.051221 0.037281 -0.005725 0.029312 0.047921 0.159584
patient_service 0.031012 0.016461 -0.066690 0.066621 0.647440 0.606273 0.718069 0.130277 0.127823 1.000000 0.074161 -0.013019 0.573315 0.874832 0.035129 0.079104 -0.057939 0.613550 0.088137 0.670846
med_change 0.160688 0.117204 0.010142 0.229139 0.029435 0.053808 0.065295 0.073193 0.036036 0.074161 1.000000 0.313205 0.071196 0.081760 0.164337 0.993561 0.003654 0.032903 0.219853 0.067311
num_med 0.075012 0.029475 0.004754 0.227674 0.000264 0.007505 -0.029519 0.013646 0.001086 -0.013019 0.313205 1.000000 0.011189 -0.007874 0.087103 0.303922 -0.002330 0.002275 0.247924 -0.025970
number_emergency_log -0.007591 0.003486 -0.052583 0.024949 0.130970 0.821051 0.300245 0.079935 0.073448 0.573315 0.071196 0.011189 1.000000 0.526026 -0.005265 0.074485 -0.049316 0.168093 0.036227 0.274532
patient_service_log 0.046722 0.016322 -0.085881 0.078923 0.541844 0.416974 0.737181 0.164029 0.136116 0.874832 0.081760 -0.007874 0.526026 1.000000 0.051696 0.087050 -0.075029 0.627560 0.107867 0.792296
time_in_hospital_log 0.957352 0.334975 0.165775 0.464633 -0.005612 -0.009312 0.076423 0.238933 0.051221 0.035129 0.164337 0.087103 -0.005265 0.051696 1.000000 0.162489 0.186613 -0.012535 0.461728 0.091074
med_change_log 0.158472 0.116351 0.008742 0.229697 0.031846 0.056512 0.069854 0.074707 0.037281 0.079104 0.993561 0.303922 0.074485 0.087050 0.162489 1.000000 0.002280 0.036005 0.221155 0.071785
num_procedures_log 0.206295 0.037231 0.960722 0.384716 -0.019934 -0.036799 -0.057980 0.067573 -0.005725 -0.057939 0.003654 -0.002330 -0.049316 -0.075029 0.186613 0.002280 1.000000 -0.027277 0.350803 -0.067252
number_outpatient_log -0.017585 -0.021961 -0.033984 0.054372 0.883733 0.119275 0.140890 0.109666 0.029312 0.613550 0.032903 0.002275 0.168093 0.627560 -0.012535 0.036005 -0.027277 1.000000 0.069718 0.151159
num_medications_log 0.442712 0.256277 0.334835 0.934708 0.057071 0.022113 0.086582 0.302911 0.047921 0.088137 0.219853 0.247924 0.036227 0.107867 0.461728 0.221155 0.350803 0.069718 1.000000 0.098746
number_inpatient_log 0.088125 0.043842 -0.076668 0.073835 0.117053 0.228602 0.932273 0.128250 0.159584 0.670846 0.067311 -0.025970 0.274532 0.792296 0.091074 0.071785 -0.067252 0.151159 0.098746 1.000000
In [61]:
# Draw heatmap to plot the correlations
"""metformin-rosiglitazone is with all NAs"""
a = plt.subplots(figsize=(25, 15))
a = sns.heatmap(train_col, vmax=.8, square=True)
In [62]:
# find the top 15 correlated variables 
k = 15
cols = train_col.nlargest(k,'readmitted')['readmitted'].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.5)
hm = plt.subplots(figsize=(25, 15))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
In [63]:
train.diabetesMed = train.diabetesMed.astype('int64')
train.change = train.change.astype('int64')

# convert data type of nominal features in dataframe to 'object' type for aggregating
i = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', \
          'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose','miglitol', \
          'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', \
          'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','A1Cresult']
train[i] = train[i].astype('int64')

train.dtypes
Out[63]:
race                         object
gender                       object
age                          object
admission_type_id            object
discharge_disposition_id     object
admission_source_id          object
time_in_hospital            float64
num_lab_procedures          float64
num_procedures              float64
num_medications             float64
number_outpatient           float64
number_emergency            float64
number_inpatient            float64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses            float64
max_glu_serum                object
A1Cresult                     int64
metformin                     int64
repaglinide                   int64
nateglinide                   int64
chlorpropamide                int64
glimepiride                   int64
acetohexamide                 int64
glipizide                     int64
glyburide                     int64
tolbutamide                   int64
pioglitazone                  int64
rosiglitazone                 int64
acarbose                      int64
miglitol                      int64
troglitazone                  int64
tolazamide                    int64
insulin                       int64
glyburide-metformin           int64
glipizide-metformin           int64
glimepiride-pioglitazone      int64
metformin-rosiglitazone       int64
metformin-pioglitazone        int64
change                        int64
diabetesMed                   int64
readmitted                    int64
patient_service             float64
med_change                  float64
num_med                     float64
primary_diag                 object
secondary_diag               object
additional_diag              object
number_emergency_log        float64
patient_service_log         float64
time_in_hospital_log        float64
med_change_log              float64
num_procedures_log          float64
number_outpatient_log       float64
num_medications_log         float64
number_inpatient_log        float64
dtype: object

Outlier

In [64]:
key=['num_med',
 'number_emergency',
 'num_lab_procedures',
 'patient_service',
 'time_in_hospital',
 'med_change',
 'num_procedures',
 'number_diagnoses',
 'number_outpatient',
 'num_medications',
 'number_inpatient']
train = train[(np.abs(sp.stats.zscore(train[key])) < 3).all(axis=1)]
In [65]:
train.columns
Out[65]:
Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'patient_service', 'med_change', 'num_med', 'primary_diag',
       'secondary_diag', 'additional_diag', 'number_emergency_log',
       'patient_service_log', 'time_in_hospital_log', 'med_change_log',
       'num_procedures_log', 'number_outpatient_log', 'num_medications_log',
       'number_inpatient_log'],
      dtype='object')
In [66]:
for i in train.columns:
    df=train[i].value_counts()
    print(df)
1    68090
2    17275
3     1818
0     1346
4      590
Name: race, dtype: int64
0    48181
1    40938
Name: gender, dtype: int64
75    23003
65    19716
85    15188
55    15112
45     8352
35     3286
95     2467
25     1321
15      560
5       114
Name: age, dtype: int64
1    63365
3    16483
5     9261
4       10
Name: admission_type_id, dtype: int64
1     65653
2     18408
18     4287
7       537
28      127
10       93
19        8
27        4
20        2
Name: discharge_disposition_id, dtype: int64
9     56717
1     27147
4      5239
8        11
11        5
Name: admission_source_id, dtype: int64
-0.467585    16055
-0.803582    15559
-1.139580    12937
-0.131587    12456
 0.204411     8808
 0.540408     6515
 0.876406     5012
 1.212404     3713
 1.548401     2487
 1.884399     1927
 2.220397     1510
 2.556394     1169
 2.892392      971
Name: time_in_hospital, dtype: int64
-2.134345    2923
 0.004121    2524
 0.055037    2177
 0.105953    2092
-0.250459    1985
-0.148627    1961
 0.156868    1933
-0.097711    1892
-0.199543    1890
-0.301374    1879
-0.046795    1877
 0.207784    1860
 0.309616    1809
 0.258700    1781
-0.352290    1744
-0.403206    1715
 0.411448    1691
 0.360532    1655
 0.564195    1597
 0.615111    1581
 0.513279    1579
 0.462364    1576
 0.666027    1539
-0.454122    1525
 0.716943    1487
 0.767859    1451
 0.920606    1397
 0.818775    1396
 0.869690    1362
 0.971522    1241
             ... 
 1.735260     341
-1.777934     330
-1.472439     310
-1.828850     297
 1.786176     279
-1.930682     261
-1.879766     259
 1.837092     251
 1.888008     249
 1.938923     215
 1.989839     177
 2.040755     169
 2.091671     126
 2.142587     108
 2.193503      91
 2.295334      73
 2.244419      64
 2.346250      49
 2.397166      48
 2.448082      47
 2.549914      44
 2.600830      34
 2.498998      33
 2.651746      30
 2.702661      23
 2.753577      21
 2.804493      16
 2.906325      11
 2.957241      10
 2.855409       6
Name: num_lab_procedures, Length: 101, dtype: int64
-0.784194    41435
-0.195382    18388
 0.393429    11164
 0.982241     8223
 2.748676     3790
 1.571053     3534
 2.159864     2585
Name: num_procedures, dtype: int64
-0.369407    5491
-0.493105    5461
-0.616803    5336
-0.122010    5200
-0.245708    5092
-0.740502    4950
 0.001688    4837
-0.864200    4546
 0.125387    4376
-0.987898    4006
 0.249085    3967
 0.372783    3532
-1.111597    3225
 0.496482    3194
 0.620180    2785
-1.235295    2490
 0.743878    2488
 0.867577    2066
-1.358993    1834
 0.991275    1790
 1.114973    1567
 1.238672    1347
-1.482692    1294
 1.362370    1193
 1.486068    1032
 1.609767     814
-1.606390     811
 1.733465     684
 1.857163     572
 1.980862     494
-1.730088     427
 2.104560     400
 2.228258     351
 2.351957     302
-1.853787     242
 2.599353     228
 2.475655     227
 2.723052     175
 2.846750     164
 2.970448     129
Name: num_medications, dtype: int64
-0.293251    75856
 0.492027     7418
 1.277304     3134
 2.062581     1762
 2.847859      949
Name: number_outpatient, dtype: int64
-0.213584    80330
 0.844533     6628
 1.902650     1614
 2.960768      547
Name: number_emergency, dtype: int64
-0.504065    60767
 0.283123    17449
 1.070312     6655
 1.857500     2904
 2.644688     1344
Name: number_inpatient, dtype: int64
428      6056
414      5553
786      3775
486      3043
410      3001
427      2526
715      1980
491      1936
780      1880
682      1804
434      1791
996      1749
276      1688
250.8    1485
599      1439
38       1339
584      1320
V57      1003
820      1000
250.6     960
562       936
435       934
493       915
577       898
574       893
518       859
560       786
296       752
440       744
250.7     722
         ... 
217         1
360         1
471         1
974         1
885         1
817         1
219         1
314         1
834         1
939         1
838         1
870         1
906         1
637         1
698         1
382         1
391         1
895         1
523         1
347         1
691         1
148         1
395         1
605         1
980         1
373         1
363         1
543         1
871         1
365         1
Name: diag_1, Length: 708, dtype: int64
276       5952
428       5795
250       5572
427       4441
401       3462
496       2944
599       2931
403       2429
414       2387
411       2180
250.02    1796
707       1683
585       1578
250.01    1392
285       1382
584       1369
780       1305
491       1304
425       1298
682       1240
486       1134
518       1011
424        956
413        916
493        776
250.6      761
305        628
786        578
280        558
998        502
          ... 
140          1
302          1
879          1
374          1
46           1
734          1
325          1
884          1
658          1
894          1
88           1
962          1
917          1
944          1
E968         1
963          1
974          1
E918         1
E850         1
725          1
656          1
347          1
250.31       1
880          1
E938         1
E818         1
E900         1
V50          1
523          1
704          1
Name: diag_2, Length: 739, dtype: int64
250       10535
401        7555
276        4452
428        3951
427        3385
414        3292
496        2247
403        1982
272        1817
599        1679
585        1667
V45        1262
780        1191
250.02     1151
707        1135
?          1076
285        1038
425        1015
250.6       927
424         925
305         847
250.01      815
584         759
682         756
41          665
518         636
278         615
493         604
530         561
786         535
          ...  
E853          1
361           1
622           1
377           1
485           1
970           1
872           1
47            1
E876          1
884           1
374           1
877           1
66            1
879           1
684           1
E900          1
148           1
671           1
395           1
841           1
750           1
853           1
E965          1
164           1
674           1
E915          1
430           1
7             1
844           1
967           1
Name: diag_3, Length: 781, dtype: int64
 0.815034    42512
-1.256610    10008
 0.297123     9396
-0.220788     9334
-0.738699     9219
-1.774522     5051
-2.292433     2625
-2.810344      927
 2.886679       15
 1.332946       15
 1.850857       10
 2.368768        7
Name: number_diagnoses, dtype: int64
99    84401
0      2383
1      2335
Name: max_glu_serum, dtype: int64
99    74516
1     10126
0      4477
Name: A1Cresult, dtype: int64
0    72481
1    16638
Name: metformin, dtype: int64
0    87874
1     1245
Name: repaglinide, dtype: int64
0    88546
1      573
Name: nateglinide, dtype: int64
0    89040
1       79
Name: chlorpropamide, dtype: int64
0    84868
1     4251
Name: glimepiride, dtype: int64
0    89119
Name: acetohexamide, dtype: int64
0    78423
1    10696
Name: glipizide, dtype: int64
0    80149
1     8970
Name: glyburide, dtype: int64
0    89099
1       20
Name: tolbutamide, dtype: int64
0    83193
1     5926
Name: pioglitazone, dtype: int64
0    83992
1     5127
Name: rosiglitazone, dtype: int64
0    88902
1      217
Name: acarbose, dtype: int64
0    89096
1       23
Name: miglitol, dtype: int64
0    89116
1        3
Name: troglitazone, dtype: int64
0    89082
1       37
Name: tolazamide, dtype: int64
1    45603
0    43516
Name: insulin, dtype: int64
0    88523
1      596
Name: glyburide-metformin, dtype: int64
0    89108
1       11
Name: glipizide-metformin, dtype: int64
0    89118
1        1
Name: glimepiride-pioglitazone, dtype: int64
0    89119
Name: metformin-rosiglitazone, dtype: int64
0    89119
Name: metformin-pioglitazone, dtype: int64
0    49759
1    39360
Name: change, dtype: int64
1    67747
0    21372
Name: diabetesMed, dtype: int64
0    79512
1     9607
Name: readmitted, dtype: int64
-0.526326    50536
-0.093359    18294
 0.339609     9278
 0.772577     5464
 1.205545     3269
 1.638513     1322
 2.071480      603
 2.504448      258
 2.937416       95
Name: patient_service, dtype: int64
-0.590221    66905
 1.456929    22214
Name: med_change, dtype: int64
-0.198754    41982
-1.283301    21372
 0.885793    19261
 1.970341     6504
Name: num_med, dtype: int64
1.0    26511
0.0    15882
2.0    12633
3.0     8566
4.0     7232
5.0     6215
7.0     4617
6.0     4482
8.0     2981
Name: primary_diag, dtype: int64
1.0    28034
0.0    23508
4.0    11437
2.0     9266
7.0     7322
3.0     3685
8.0     2208
5.0     2081
6.0     1578
Name: secondary_diag, dtype: int64
0.0    26799
1.0    26626
4.0    15342
2.0     6221
7.0     5703
3.0     3484
6.0     1690
5.0     1645
8.0     1609
Name: additional_diag, dtype: int64
0.000000    80330
0.693147     6628
1.098612     1614
1.386294      547
Name: number_emergency_log, dtype: int64
0.000000    50536
0.693147    18294
1.098612     9278
1.386294     5464
1.609438     3269
1.791759     1322
1.945910      603
2.079442      258
2.197225       95
Name: patient_service_log, dtype: int64
1.386294    16055
1.098612    15559
0.693147    12937
1.609438    12456
1.791759     8808
1.945910     6515
2.079442     5012
2.197225     3713
2.302585     2487
2.397895     1927
2.484907     1510
2.564949     1169
2.639057      971
Name: time_in_hospital_log, dtype: int64
0.000000    66905
0.693147    22214
Name: med_change_log, dtype: int64
0.000000    41435
0.693147    18388
1.098612    11164
1.386294     8223
1.945910     3790
1.609438     3534
1.791759     2585
Name: num_procedures_log, dtype: int64
0.000000    75856
0.693147     7418
1.098612     3134
1.386294     1762
1.609438      949
Name: number_outpatient_log, dtype: int64
2.639057    5491
2.564949    5461
2.484907    5336
2.772589    5200
2.708050    5092
2.397895    4950
2.833213    4837
2.302585    4546
2.890372    4376
2.197225    4006
2.944439    3967
2.995732    3532
2.079442    3225
3.044522    3194
3.091042    2785
1.945910    2490
3.135494    2488
3.178054    2066
1.791759    1834
3.218876    1790
3.258097    1567
3.295837    1347
1.609438    1294
3.332205    1193
3.367296    1032
3.401197     814
1.386294     811
3.433987     684
3.465736     572
3.496508     494
1.098612     427
3.526361     400
3.555348     351
3.583519     302
0.693147     242
3.637586     228
3.610918     227
3.663562     175
3.688879     164
3.713572     129
Name: num_medications_log, dtype: int64
0.000000    60767
0.693147    17449
1.098612     6655
1.386294     2904
1.609438     1344
Name: number_inpatient_log, dtype: int64
In [67]:
train['primary_diag'] = train['primary_diag'].astype('int')
train_v = pd.get_dummies(train, columns=['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                                      'admission_source_id', 'max_glu_serum', 'A1Cresult', 'primary_diag'], drop_first = True)
In [68]:
nom_cols = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'max_glu_serum', 'A1Cresult', 'primary_diag' ]
In [69]:
num_cols = list(set(list(train._get_numeric_data().columns))- {'readmitted', 'change'})
num_cols
Out[69]:
['number_emergency',
 'number_outpatient',
 'number_diagnoses',
 'time_in_hospital',
 'time_in_hospital_log',
 'glipizide-metformin',
 'miglitol',
 'tolazamide',
 'acetohexamide',
 'num_lab_procedures',
 'patient_service_log',
 'num_procedures',
 'pioglitazone',
 'acarbose',
 'patient_service',
 'med_change_log',
 'chlorpropamide',
 'med_change',
 'number_inpatient_log',
 'glipizide',
 'num_medications_log',
 'metformin-rosiglitazone',
 'A1Cresult',
 'repaglinide',
 'glimepiride-pioglitazone',
 'number_inpatient',
 'troglitazone',
 'number_emergency_log',
 'num_med',
 'metformin',
 'diabetesMed',
 'num_medications',
 'tolbutamide',
 'rosiglitazone',
 'glyburide',
 'primary_diag',
 'insulin',
 'num_procedures_log',
 'metformin-pioglitazone',
 'glyburide-metformin',
 'number_outpatient_log',
 'glimepiride',
 'nateglinide']
In [70]:
# Append all columns with dummy variables 
nom_cols_new = []
for i in nom_cols:
    for j in train_v.columns:
        if i in j:
            nom_cols_new.append(j)
In [71]:
nom_cols_new
Out[71]:
['race_1',
 'race_2',
 'race_3',
 'race_4',
 'gender_1',
 'admission_type_id_3',
 'admission_type_id_4',
 'admission_type_id_5',
 'discharge_disposition_id_2',
 'discharge_disposition_id_7',
 'discharge_disposition_id_10',
 'discharge_disposition_id_18',
 'discharge_disposition_id_19',
 'discharge_disposition_id_20',
 'discharge_disposition_id_27',
 'discharge_disposition_id_28',
 'admission_source_id_4',
 'admission_source_id_8',
 'admission_source_id_9',
 'admission_source_id_11',
 'max_glu_serum_1',
 'max_glu_serum_99',
 'A1Cresult_1',
 'A1Cresult_99',
 'primary_diag_1',
 'primary_diag_2',
 'primary_diag_3',
 'primary_diag_4',
 'primary_diag_5',
 'primary_diag_6',
 'primary_diag_7',
 'primary_diag_8']
In [72]:
train_v.columns
Out[72]:
Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'patient_service', 'med_change', 'num_med', 'secondary_diag',
       'additional_diag', 'number_emergency_log', 'patient_service_log',
       'time_in_hospital_log', 'med_change_log', 'num_procedures_log',
       'number_outpatient_log', 'num_medications_log', 'number_inpatient_log',
       'race_1', 'race_2', 'race_3', 'race_4', 'gender_1',
       'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5',
       'discharge_disposition_id_2', 'discharge_disposition_id_7',
       'discharge_disposition_id_10', 'discharge_disposition_id_18',
       'discharge_disposition_id_19', 'discharge_disposition_id_20',
       'discharge_disposition_id_27', 'discharge_disposition_id_28',
       'admission_source_id_4', 'admission_source_id_8',
       'admission_source_id_9', 'admission_source_id_11', 'max_glu_serum_1',
       'max_glu_serum_99', 'A1Cresult_1', 'A1Cresult_99', 'primary_diag_1',
       'primary_diag_2', 'primary_diag_3', 'primary_diag_4', 'primary_diag_5',
       'primary_diag_6', 'primary_diag_7', 'primary_diag_8'],
      dtype='object')
In [73]:
# Create a feature set
feature_set= ['race_1','race_2','race_3','race_4','gender_1','age','admission_type_id_3',
       'admission_type_id_4','admission_type_id_5','discharge_disposition_id_2', 
'discharge_disposition_id_7','discharge_disposition_id_10','discharge_disposition_id_18','discharge_disposition_id_19',
        'discharge_disposition_id_20','discharge_disposition_id_27','discharge_disposition_id_28','admission_source_id_4', 'admission_source_id_8',
        'admission_source_id_9','admission_source_id_11','num_lab_procedures','number_diagnoses', 'max_glu_serum_1', 'max_glu_serum_99', 'A1Cresult_1', 'A1Cresult_99', 
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 
        'num_med', 'primary_diag_1','primary_diag_2','primary_diag_3','primary_diag_4','primary_diag_5','primary_diag_6','primary_diag_7','primary_diag_8',
        'number_emergency_log','patient_service_log', 'time_in_hospital_log', 'med_change_log',
       'num_procedures_log', 'number_outpatient_log', 'num_medications_log',
       'number_inpatient_log']
In [74]:
train_v.to_csv('./modified_diabetes1205_V2.csv',index=None)
In [83]:
train_v = pd.read_csv('./modified_diabetes1205_V2.csv')
In [75]:
train_v.columns
Out[75]:
Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'patient_service', 'med_change', 'num_med', 'secondary_diag',
       'additional_diag', 'number_emergency_log', 'patient_service_log',
       'time_in_hospital_log', 'med_change_log', 'num_procedures_log',
       'number_outpatient_log', 'num_medications_log', 'number_inpatient_log',
       'race_1', 'race_2', 'race_3', 'race_4', 'gender_1',
       'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5',
       'discharge_disposition_id_2', 'discharge_disposition_id_7',
       'discharge_disposition_id_10', 'discharge_disposition_id_18',
       'discharge_disposition_id_19', 'discharge_disposition_id_20',
       'discharge_disposition_id_27', 'discharge_disposition_id_28',
       'admission_source_id_4', 'admission_source_id_8',
       'admission_source_id_9', 'admission_source_id_11', 'max_glu_serum_1',
       'max_glu_serum_99', 'A1Cresult_1', 'A1Cresult_99', 'primary_diag_1',
       'primary_diag_2', 'primary_diag_3', 'primary_diag_4', 'primary_diag_5',
       'primary_diag_6', 'primary_diag_7', 'primary_diag_8'],
      dtype='object')

Modeling

In [76]:
train_input = train_v[feature_set]
train_output = train_v['readmitted']
In [77]:
# Check the class for the outcome variable 
target_count=train_v['readmitted'].value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

target_count.plot(kind='bar', title='Count (target)')
Class 0: 79512
Class 1: 9607
Proportion: 8.28 : 1
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a34f1cd30>

Logistic Regression-Benchmark & Class Imbalance

In [78]:
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#from xgboost.sklearn import XGBClassifier 
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

sns.set(style='white', context='notebook', palette='deep')
In [79]:
# Split the training and testing dataset 
x_train, x_test, y_train, y_test= model_selection.train_test_split(train_input, train_output, random_state = 0, test_size=0.1)
In [80]:
import numpy as np
import pandas as pd

target_count = train_output.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

target_count.plot(kind='bar', title='Count (target)');
Class 0: 79512
Class 1: 9607
Proportion: 8.28 : 1
In [81]:
LogR = LogisticRegression(fit_intercept=True, penalty='l1',random_state = 0)
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(LogR, x_train, y_train, cv=10))))
LogR.fit(x_train, y_train)
print("Test Set score: {:.2%}".format(LogR.score(x_test, y_test)))
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Cross Validation Score: 89.22%
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Test Set score: 89.12%
In [82]:
y_pred = LogR.predict(x_test)
In [83]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, y_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, y_pred)))
print("Recall is {0:.2f}".format(recall_score(y_test, y_pred)))
Accuracy is 0.89
Precision is 0.00
Recall is 0.00
In [84]:
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(y_pred, name = 'Predict'), margins = True)
Out[84]:
Predict 0 All
Actual
0 705 705
1 103 103
All 808 808
In [85]:
# Confusion Matrix and Class imbalance cehcking 
from sklearn.metrics import confusion_matrix
cfm = confusion_matrix(y_test, y_pred)
print(cfm)
labels = ['Class 0', 'Class 1']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cfm, cmap=plt.cm.Reds)
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('Expected')
plt.show()
[[7942    1]
 [ 969    0]]
In [86]:
# Class Imbalance Processing: Using re-sampling method
from imblearn.over_sampling import SMOTE
from collections import Counter
print('Original dataset shape {}'.format(Counter(train_output)))
sm = SMOTE(random_state=20)
train_input_new, train_output_new = sm.fit_sample(train_input, train_output)
print('New dataset shape {}'.format(Counter(train_output_new)))
Original dataset shape Counter({0: 79512, 1: 9607})
New dataset shape Counter({0: 79512, 1: 79512})
In [87]:
train_input_new = pd.DataFrame(train_input_new, columns = list(train_input.columns))
In [88]:
pd.concat([pd.DataFrame(train_input_new), pd.DataFrame(train_output_new)],axis=1).to_csv('./smote_data.csv',header=None)
In [89]:
# Again,split the training and testing dataset 
x_train_new, x_test_new, y_train_new, y_test_new= model_selection.train_test_split(train_input_new, train_output_new, random_state = 0, test_size=0.1)
In [92]:
# Run Logistic Regression Again 
LogR = LogisticRegression(fit_intercept=True, penalty='l1',random_state = 0)
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(LogR, x_train_new, y_train_new, cv=10))))
LogR.fit(x_train_new, y_train_new)
print("Test Set score: {:.2%}".format(LogR.score(x_test_new, y_test_new)))
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Cross Validation Score: 61.29%
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Test Set score: 61.37%
In [93]:
y_test_predict = LogR.predict(x_test_new)
In [94]:
# Confusion Matrix and Class imbalance cehcking 
from sklearn.metrics import confusion_matrix
cfm = confusion_matrix(y_test_new, y_test_predict)
print(cfm)
labels = ['Class 0', 'Class 1']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cfm, cmap=plt.cm.Reds)
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('Expected')
plt.show()
[[5010 2826]
 [3318 4749]]
In [95]:
pd.crosstab(pd.Series(y_test_new, name = 'Actual'), pd.Series(y_test_predict, name = 'Predict'), margins = True)
Out[95]:
Predict 0 1 All
Actual
0 5010 2826 7836
1 3318 4749 8067
All 8328 7575 15903
In [96]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, y_test_predict)))
print("Precision is {0:.2f}".format(precision_score(y_test_new, y_test_predict)))
print("Recall is {0:.2f}".format(recall_score(y_test_new, y_test_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, y_test_predict)))

accuracy_logreg = accuracy_score(y_test_new, y_test_predict)
precision_logreg = precision_score(y_test_new, y_test_predict)
recall_logreg = recall_score(y_test_new, y_test_predict)
auc_logreg = roc_auc_score(y_test_new, y_test_predict)
Accuracy is 0.61
Precision is 0.63
Recall is 0.59
AUC is 0.61

Decision Tree

Decision Tree-Entropy

In [97]:
feature_set_dec = ['age', 'time_in_hospital', 'num_procedures', 'num_medications', 'number_outpatient_log', 
                 'number_emergency_log', 'number_inpatient_log', 'number_diagnoses', 'metformin', 
                 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 
                 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 
                 'tolazamide', 'insulin', 'glyburide-metformin',
                 'race_1', 'race_2', 'race_3', 
                 'race_4', 'gender_1', 
                 'admission_source_id_4', 'admission_source_id_8', 'admission_source_id_9','admission_source_id_11', 
                 'discharge_disposition_id_2', 'discharge_disposition_id_7', 
                 'discharge_disposition_id_10', 'discharge_disposition_id_18', 
                 'max_glu_serum_1', 'max_glu_serum_99', 'A1Cresult_1', 'A1Cresult_99', 
                 'primary_diag_1','primary_diag_2','primary_diag_3','primary_diag_4','primary_diag_5','primary_diag_6','primary_diag_7','primary_diag_8']
In [98]:
train_input = train_v[feature_set_dec]
train_output = train_v['readmitted']
train_v['readmitted'].value_counts()
Out[98]:
0    79512
1     9607
Name: readmitted, dtype: int64
In [99]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print('Original dataset shape {}'.format(Counter(train_output)))
smt = SMOTE(random_state=20)
train_input_new, train_output_new = smt.fit_sample(train_input, train_output)
print('New dataset shape {}'.format(Counter(train_output_new)))
train_input_new = pd.DataFrame(train_input_new, columns = list(train_input.columns))
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(train_input_new, train_output_new, test_size=0.20, random_state=0)
Original dataset shape Counter({0: 79512, 1: 9607})
New dataset shape Counter({0: 79512, 1: 79512})
In [100]:
from sklearn.tree import DecisionTreeClassifier
dte = DecisionTreeClassifier(max_depth=28, criterion = "entropy", min_samples_split=10)
print("Cross Validation score: {:.2%}".format(np.mean(cross_val_score(dte, x_train_new, y_train_new, cv=10))))
dte.fit(x_train_new, y_train_new)
print("Dev Set score: {:.2%}".format(dte.score(x_test_new, y_test_new)))
Cross Validation score: 88.98%
Dev Set score: 89.36%
In [101]:
y_test_predict = dte.predict(x_test_new)
In [102]:
pd.crosstab(pd.Series(y_test_new, name = 'Actual'), pd.Series(y_test_predict, name = 'Predict'), margins = True)
Out[102]:
Predict 0 1 All
Actual
0 14468 1217 15685
1 2168 13952 16120
All 16636 15169 31805
In [103]:
# Calculate Accuracy, precision, reacall and AUC with classificaiton-entropy
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, y_test_predict)))
print("Precision is {0:.2f}".format(precision_score(y_test_new, y_test_predict)))
print("Recall is {0:.2f}".format(recall_score(y_test_new, y_test_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, y_test_predict)))

accuracy_dte = accuracy_score(y_test_new, y_test_predict)
precision_dte = precision_score(y_test_new, y_test_predict)
recall_dte = recall_score(y_test_new, y_test_predict)
auc_dte = roc_auc_score(y_test_new, y_test_predict)
Accuracy is 0.89
Precision is 0.92
Recall is 0.87
AUC is 0.89

Decision Tree-Gini

In [104]:
# Calculate
from sklearn.tree import DecisionTreeClassifier
dte = DecisionTreeClassifier(max_depth=28, criterion = "gini", min_samples_split=10)
print("Cross Validation score: {:.2%}".format(np.mean(cross_val_score(dte, x_train_new, y_train_new, cv=10))))
dte.fit(x_train_new, y_train_new)
print("Dev Set score: {:.2%}".format(dte.score(x_test_new, y_test_new)))
Cross Validation score: 89.13%
Dev Set score: 89.39%
In [105]:
y_test_predict = dte.predict(x_test_new)
In [106]:
pd.crosstab(pd.Series(y_test_new, name = 'Actual'), pd.Series(y_test_predict, name = 'Predict'), margins = True)
Out[106]:
Predict 0 1 All
Actual
0 14448 1237 15685
1 2139 13981 16120
All 16587 15218 31805
In [107]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, y_test_predict)))
print("Precision is {0:.2f}".format(precision_score(y_test_new, y_test_predict)))
print("Recall is {0:.2f}".format(recall_score(y_test_new, y_test_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, y_test_predict)))

accuracy_dtg = accuracy_score(y_test_new, y_test_predict)
precision_dtg = precision_score(y_test_new, y_test_predict)
recall_dtg = recall_score(y_test_new, y_test_predict)
auc_dtg = roc_auc_score(y_test_new, y_test_predict)
Accuracy is 0.89
Precision is 0.92
Recall is 0.87
AUC is 0.89
In [108]:
# Create list of top most features based on importance
feature_names = x_train_new.columns
feature_imports = dte.feature_importances_
most_imp_features = pd.DataFrame([f for f in zip(feature_names,feature_imports)], columns=["Feature", "Importance"]).nlargest(10, "Importance")
most_imp_features.sort_values(by="Importance", inplace=True)
print(most_imp_features)
plt.figure(figsize=(10,6))
plt.barh(range(len(most_imp_features)), most_imp_features.Importance, align='center', alpha=0.8)
plt.yticks(range(len(most_imp_features)), most_imp_features.Feature, fontsize=14)
plt.xlabel('Importance')
plt.title('Most important features - Decision Tree')
plt.show()
                       Feature  Importance
19                     insulin    0.022728
4        number_outpatient_log    0.024900
8                    metformin    0.025077
25                    gender_1    0.029290
3              num_medications    0.041278
7             number_diagnoses    0.041326
2               num_procedures    0.047697
30  discharge_disposition_id_2    0.065841
1             time_in_hospital    0.254720
6         number_inpatient_log    0.268297
In [109]:
import graphviz
from IPython.display import Image
import pydotplus
from sklearn import tree
dot_dt_q2 = tree.export_graphviz(dte, out_file="dt_q2.dot", feature_names=x_train_new.columns, max_depth=2,
                                 class_names=["No","Readm"], filled=True, rounded=True, special_characters=True)
graph_dt_q2 = pydotplus.graph_from_dot_file('dt_q2.dot')
Image(graph_dt_q2.create_png())
Out[109]:

Random Forest

In [110]:
from sklearn.ensemble import RandomForestClassifier
forrest = RandomForestClassifier(n_estimators = 10, max_depth=25, criterion = "entropy", min_samples_split=10)
print("Cross Validation score: {:.2%}".format(np.mean(cross_val_score(forrest, x_train_new, y_train_new, cv=10))))
forrest.fit(x_train_new, y_train_new)
print("Dev Set score: {:.2%}".format(forrest.score(x_test_new, y_test_new)))
Cross Validation score: 92.23%
Dev Set score: 92.46%
In [111]:
y_test_predict = forrest.predict(x_test_new)
pd.crosstab(pd.Series(y_test_new, name = 'Actual'), pd.Series(y_test_predict, name = 'Predict'), margins = True)
Out[111]:
Predict 0 1 All
Actual
0 15426 259 15685
1 2140 13980 16120
All 17566 14239 31805
In [112]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, y_test_predict)))
print("Precision is {0:.2f}".format(precision_score(y_test_new, y_test_predict)))
print("Recall is {0:.2f}".format(recall_score(y_test_new, y_test_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, y_test_predict)))

accuracy_forreste = accuracy_score(y_test_new, y_test_predict)
precision_forreste = precision_score(y_test_new, y_test_predict)
recall_forreste = recall_score(y_test_new, y_test_predict)
auc_forreste = roc_auc_score(y_test_new, y_test_predict)
Accuracy is 0.92
Precision is 0.98
Recall is 0.87
AUC is 0.93
In [113]:
forrest = RandomForestClassifier(n_estimators = 10, max_depth=25, criterion = "gini", min_samples_split=10)
print("Cross Validation score: {:.2%}".format(np.mean(cross_val_score(forrest, x_train_new, y_train_new, cv=10))))
forrest.fit(x_train_new, y_train_new)
print("Dev Set score: {:.2%}".format(forrest.score(x_test_new, y_test_new)))
Cross Validation score: 92.30%
Dev Set score: 92.64%
In [114]:
y_test_predict = forrest.predict(x_test_new)
pd.crosstab(pd.Series(y_test_new, name = 'Actual'), pd.Series(y_test_predict, name = 'Predict'), margins = True)
Out[114]:
Predict 0 1 All
Actual
0 15367 318 15685
1 2023 14097 16120
All 17390 14415 31805
In [115]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, y_test_predict)))
print("Precision is {0:.2f}".format(precision_score(y_test_new, y_test_predict)))
print("Recall is {0:.2f}".format(recall_score(y_test_new, y_test_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, y_test_predict)))

accuracy_forrestg = accuracy_score(y_test_new, y_test_predict)
precision_forrestg = precision_score(y_test_new, y_test_predict)
recall_forrestg = recall_score(y_test_new, y_test_predict)
auc_forrestg = roc_auc_score(y_test_new, y_test_predict)
Accuracy is 0.93
Precision is 0.98
Recall is 0.87
AUC is 0.93
In [116]:
# Create list of top most features based on importance
feature_names = x_train_new.columns
feature_imports = forrest.feature_importances_
most_imp_features = pd.DataFrame([f for f in zip(feature_names,feature_imports)], columns=["Feature", "Importance"]).nlargest(10, "Importance")
most_imp_features.sort_values(by="Importance", inplace=True)
plt.figure(figsize=(10,6))
plt.barh(range(len(most_imp_features)), most_imp_features.Importance, align='center', alpha=0.8)
plt.yticks(range(len(most_imp_features)), most_imp_features.Feature, fontsize=14)
plt.xlabel('Importance')
plt.title('Most important features - Random Forest (gini)')
plt.show()
In [117]:
impforrest = [(x[1], x[0]) for x in sorted(zip(list(forrest.feature_importances_), list(train_input.columns)), reverse = True)]
print("Random Forest Feature Importance in decreasing order")
impforrest = pd.DataFrame(impforrest, columns=["Feature","Importance"])
impforrest
Random Forest Feature Importance in decreasing order
Out[117]:
Feature Importance
0 number_inpatient_log 0.142575
1 time_in_hospital 0.091733
2 number_diagnoses 0.074642
3 insulin 0.059118
4 num_procedures 0.056511
5 discharge_disposition_id_2 0.055647
6 metformin 0.051825
7 num_medications 0.049705
8 gender_1 0.036074
9 number_outpatient_log 0.035325
10 admission_source_id_9 0.031500
11 primary_diag_1 0.028619
12 number_emergency_log 0.026847
13 age 0.024272
14 A1Cresult_99 0.023960
15 glipizide 0.022664
16 glyburide 0.020515
17 rosiglitazone 0.018923
18 pioglitazone 0.017128
19 A1Cresult_1 0.014035
20 primary_diag_3 0.013171
21 primary_diag_2 0.012294
22 race_1 0.011138
23 glimepiride 0.009056
24 primary_diag_4 0.008125
25 primary_diag_5 0.008122
26 admission_source_id_4 0.007941
27 race_2 0.007541
28 primary_diag_7 0.007139
29 discharge_disposition_id_18 0.006776
30 max_glu_serum_99 0.005543
31 primary_diag_6 0.005106
32 primary_diag_8 0.004038
33 max_glu_serum_1 0.002472
34 repaglinide 0.002404
35 race_3 0.002162
36 discharge_disposition_id_7 0.001253
37 nateglinide 0.001137
38 glyburide-metformin 0.000914
39 race_4 0.000886
40 discharge_disposition_id_10 0.000580
41 acarbose 0.000379
42 chlorpropamide 0.000092
43 tolazamide 0.000081
44 admission_source_id_8 0.000028
45 admission_source_id_11 0.000000

Modeling Improvement

XGBOOST-Tuning

In [131]:
!conda install -c conda-forge xgboost --yes
Solving environment: done

## Package Plan ##

  environment location: /anaconda3

  added / updated specs: 
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    xgboost-0.80               |   py36hfc679d8_1           9 KB  conda-forge
    llvm-openmp-4.0.1          |       hcfea43d_1         454 KB
    certifi-2018.4.16          |           py36_0         142 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    py-xgboost-0.80            |   py36hfc679d8_1          62 KB  conda-forge
    libxgboost-0.80            |       hfc679d8_1         2.0 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.7 MB

The following NEW packages will be INSTALLED:

    _py-xgboost-mutex: 2.0-cpu_0           conda-forge
    libxgboost:        0.80-hfc679d8_1     conda-forge
    llvm-openmp:       4.0.1-hcfea43d_1               
    py-xgboost:        0.80-py36hfc679d8_1 conda-forge
    xgboost:           0.80-py36hfc679d8_1 conda-forge

The following packages will be UPDATED:

    certifi:           2018.4.16-py36_0                --> 2018.4.16-py36_0 conda-forge
    conda:             4.5.11-py36_0                   --> 4.5.11-py36_1000 conda-forge


Downloading and Extracting Packages
xgboost-0.80         | 9 KB      | ##################################### | 100% 
llvm-openmp-4.0.1    | 454 KB    | ##################################### | 100% 
certifi-2018.4.16    | 142 KB    | ##################################### | 100% 
_py-xgboost-mutex-2. | 8 KB      | ##################################### | 100% 
py-xgboost-0.80      | 62 KB     | ##################################### | 100% 
libxgboost-0.80      | 2.0 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
In [132]:
import pandas as pd
import xgboost as xgb
import operator

parameters:

1. eta
2. max_depth
3. colsample_bytree   

  • The tuning process includes the above three parameters: we set eta large firstly and we tune max_depth firstly then colsample_bytree
    after we make sure them we can tune eta and num_rounds together
max_depth AUC
3 0.94
4 0.94
5 0.94
6 0.94
7 0.94
8 0.94
9 0.94
In [142]:
for max_depth in [3,4,5,6,7,8,9]:
    xgb_params = {'booster':'gbtree','objective': 'binary:logistic', "eta": 0.2, \
                  "max_depth": max_depth,"colsample_bytree":0.7, 'silent':1}
    num_rounds = 20000

    dtrain = xgb.DMatrix(x_train_new, label=y_train_new)
    dtest= xgb.DMatrix(x_test_new, label=y_test_new)
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    gbm = xgb.train(xgb_params.items(), dtrain, num_rounds, evals=evallist,verbose_eval=100,early_stopping_rounds=100)
    y_predict = gbm.predict(xgb.DMatrix(x_test_new))
[0]	train-error:0.311314	test-error:0.314447
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.068748	test-error:0.069077
[200]	train-error:0.064511	test-error:0.064518
[300]	train-error:0.062711	test-error:0.063103
[400]	train-error:0.061948	test-error:0.062412
[500]	train-error:0.061406	test-error:0.062034
[600]	train-error:0.061052	test-error:0.06172
[700]	train-error:0.060754	test-error:0.061468
[800]	train-error:0.060494	test-error:0.061248
[900]	train-error:0.060345	test-error:0.061154
Stopping. Best iteration:
[858]	train-error:0.060408	test-error:0.061154

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.310347	test-error:0.313378
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.064762	test-error:0.065084
[200]	train-error:0.062066	test-error:0.0626
[300]	train-error:0.061288	test-error:0.062097
[400]	train-error:0.060541	test-error:0.061657
[500]	train-error:0.060085	test-error:0.061185
[600]	train-error:0.05974	test-error:0.061091
Stopping. Best iteration:
[590]	train-error:0.059787	test-error:0.06106

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.265558	test-error:0.272253
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.063332	test-error:0.063921
[200]	train-error:0.061288	test-error:0.062003
[300]	train-error:0.060211	test-error:0.061783
[400]	train-error:0.059299	test-error:0.061688
[500]	train-error:0.058309	test-error:0.061594
[600]	train-error:0.057515	test-error:0.061563
Stopping. Best iteration:
[556]	train-error:0.057877	test-error:0.061405

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.262783	test-error:0.269172
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.062074	test-error:0.062852
[200]	train-error:0.060203	test-error:0.061783
[300]	train-error:0.05834	test-error:0.061531
Stopping. Best iteration:
[268]	train-error:0.059024	test-error:0.061343

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.227482	test-error:0.231976
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.061217	test-error:0.062286
[200]	train-error:0.058474	test-error:0.061783
[300]	train-error:0.055031	test-error:0.061814
Stopping. Best iteration:
[206]	train-error:0.05823	test-error:0.061688

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.219723	test-error:0.226002
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.06018	test-error:0.062003
[200]	train-error:0.055322	test-error:0.06194
Stopping. Best iteration:
[146]	train-error:0.058152	test-error:0.061437

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.192047	test-error:0.197579
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.058749	test-error:0.061971
[200]	train-error:0.050755	test-error:0.06194
Stopping. Best iteration:
[172]	train-error:0.053239	test-error:0.0615

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
colsample_bytree AUC
0.6 0.94
0.7 0.94
0.8 0.94
0.9 0.94
1 0.94
In [47]:
for colsample_bytree in [0.6, 0.7, 0.8, 0.9, 1]:
    xgb_params = {'booster':'gbtree','objective': 'binary:logistic', "eta": 0.2, \
                  "max_depth": 8,"colsample_bytree":colsample_bytree, 'silent':1}
    num_rounds = 20000

    dtrain = xgb.DMatrix(x_train_new, label=y_train_new)
    dtest= xgb.DMatrix(x_test_new, label=y_test_new)
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    gbm = xgb.train(xgb_params.items(), dtrain, num_rounds, evals=evallist,verbose_eval=100,early_stopping_rounds=100)
    y_predict = gbm.predict(xgb.DMatrix(x_test_new))
[0]	train-error:0.198937	test-error:0.198768
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.059907	test-error:0.06068
[200]	train-error:0.055072	test-error:0.061058
Stopping. Best iteration:
[141]	train-error:0.058203	test-error:0.060366

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.199167	test-error:0.198893
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.059649	test-error:0.060303
[200]	train-error:0.054304	test-error:0.061246
Stopping. Best iteration:
[117]	train-error:0.058887	test-error:0.060177

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.199328	test-error:0.199271
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.059635	test-error:0.060617
[200]	train-error:0.053898	test-error:0.060869
Stopping. Best iteration:
[128]	train-error:0.0583	test-error:0.060429

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.201613	test-error:0.19883
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.059502	test-error:0.060555
[200]	train-error:0.053179	test-error:0.060806
Stopping. Best iteration:
[113]	train-error:0.059111	test-error:0.060366

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.201613	test-error:0.19883
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.059334	test-error:0.060429
[200]	train-error:0.05306	test-error:0.06068
Stopping. Best iteration:
[121]	train-error:0.058147	test-error:0.06024

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
eta AUC
0.01 0.94
0.02 0.94
0.05 0.94
In [48]:
for eta in [0.01, 0.02, 0.05]: 
    xgb_params = {'booster':'gbtree','objective': 'binary:logistic', "eta": eta, \
                  "max_depth": 8,"colsample_bytree":0.9,  'silent':1}
    num_rounds = 20000

    dtrain = xgb.DMatrix(x_train_new, label=y_train_new)
    dtest= xgb.DMatrix(x_test_new, label=y_test_new)
    evallist = [(dtrain, 'train'), (dtest, 'test')]
    gbm = xgb.train(xgb_params.items(), dtrain, num_rounds, evals=evallist,verbose_eval=100,early_stopping_rounds=100)
    y_predict = gbm.predict(xgb.DMatrix(x_test_new))
[0]	train-error:0.201613	test-error:0.19883
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.135075	test-error:0.131107
[200]	train-error:0.09928	test-error:0.099227
[300]	train-error:0.078724	test-error:0.077658
[400]	train-error:0.070486	test-error:0.069924
[500]	train-error:0.066671	test-error:0.066277
[600]	train-error:0.06491	test-error:0.064642
[700]	train-error:0.06375	test-error:0.063573
[800]	train-error:0.062758	test-error:0.062315
[900]	train-error:0.062129	test-error:0.061812
[1000]	train-error:0.061766	test-error:0.061624
[1100]	train-error:0.061521	test-error:0.061561
[1200]	train-error:0.061326	test-error:0.061435
[1300]	train-error:0.061193	test-error:0.061246
[1400]	train-error:0.06106	test-error:0.061058
Stopping. Best iteration:
[1371]	train-error:0.061109	test-error:0.061058

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.201613	test-error:0.19883
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.098497	test-error:0.097529
[200]	train-error:0.070157	test-error:0.069547
[300]	train-error:0.065064	test-error:0.064768
[400]	train-error:0.062737	test-error:0.06263
[500]	train-error:0.061731	test-error:0.061624
[600]	train-error:0.06134	test-error:0.061372
[700]	train-error:0.061074	test-error:0.061183
[800]	train-error:0.060753	test-error:0.060806
[900]	train-error:0.060424	test-error:0.06068
[1000]	train-error:0.060075	test-error:0.060617
Stopping. Best iteration:
[947]	train-error:0.060271	test-error:0.060617

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
[0]	train-error:0.201613	test-error:0.19883
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 100 rounds.
[100]	train-error:0.066615	test-error:0.066214
[200]	train-error:0.061759	test-error:0.061624
[300]	train-error:0.060718	test-error:0.060932
[400]	train-error:0.059984	test-error:0.06068
[500]	train-error:0.058999	test-error:0.060429
[600]	train-error:0.057811	test-error:0.060366
[700]	train-error:0.056463	test-error:0.060303
Stopping. Best iteration:
[610]	train-error:0.057692	test-error:0.06024

Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
In [145]:
# Using XGBoosting to find the strongest predictors 
from xgboost import plot_importance
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# figure(num=None, figsize=(20,10), dpi=200, facecolor='w', edgecolor='k')
# plt.figure(figsize=(40,20))
ax = xgb.plot_importance(gbm, max_num_features=10)
fig = ax.figure
fig.set_size_inches(40,20)
In [141]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
print("Accuracy is {0:.2f}".format(accuracy_score(y_test_new, np.round(y_predict))))
print("Precision is {0:.2f}".format(precision_score(y_test_new, np.round(y_predict))))
print("Recall is {0:.2f}".format(recall_score(y_test_new, np.round(y_predict))))
print("AUC is {0:.2f}".format(roc_auc_score(y_test_new, np.round(y_predict))))

accuracy_xgb = accuracy_score(y_test_new, np.round(y_predict))
precision_xgb = precision_score(y_test_new, np.round(y_predict))
recall_xgb = recall_score(y_test_new, np.round(y_predict))
auc_xgb = roc_auc_score(y_test_new, np.round(y_predict))
Accuracy is 0.94
Precision is 1.00
Recall is 0.88
AUC is 0.94
In [147]:
# plotting the accuracy for training and test
plt.figure(figsize=(14, 5))
ax = plt.subplot(111)

models = ['Logistic Regression', 'Decision Tree Gini', 'Decision Tree Entropy', 'Random Forests Gini', 'Random Forests Entropy', 'XGBoost' ]
values = [accuracy_logreg, accuracy_dtg, accuracy_dte, accuracy_forrestg, accuracy_forreste, accuracy_xgb]
model = np.arange(len(models))

plt.bar(model, values, align='center', width = 0.15, alpha=0.7, color = 'red', label= 'accuracy')
plt.xticks(model, models)



ax = plt.subplot(111)

models = ['Logistic Regression', 'Decision Tree Gini', 'Decision Tree Entropy', 'Random Forests Gini', 'Random Forests Entropy', 'XGBoost' ]
values = [precision_logreg, precision_dtg, precision_dte, precision_forrestg, precision_forreste, precision_xgb]
model = np.arange(len(models))

plt.bar(model+0.15, values, align='center', width = 0.15, alpha=0.7, color = 'blue', label = 'precision')
plt.xticks(model, models)



ax = plt.subplot(111)

models = ['Logistic Regression', 'Decision Tree Gini', 'Decision Tree Entropy', 'Random Forests Gini', 'Random Forests Entropy', 'XGBoost']
values = [recall_logreg, recall_dtg, recall_dte, recall_forrestg, recall_forreste, recall_xgb]
model = np.arange(len(models))

plt.bar(model+0.3, values, align='center', width = 0.15, alpha=0.7, color = 'green', label = 'recall')
plt.xticks(model, models)



ax = plt.subplot(111)

models = ['Logistic Regression', 'Decision Tree Gini', 'Decision Tree Entropy', 'Random Forests Gini', 'Random Forests Entropy', 'XGBoost']
values = [auc_logreg, auc_dtg, auc_dte, auc_forrestg, auc_forreste, auc_xgb]
model = np.arange(len(models))

plt.bar(model+0.45, values, align='center', width = 0.15, alpha=0.7, color = 'orange', label = 'AUC')
plt.xticks(model, models)



plt.ylabel('Performance Metrics for Different models')
plt.title('Model')
    
# removing the axis on the top and right of the plot window
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()

plt.show()
/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  warnings.warn(message, mplDeprecation, stacklevel=1)
/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  warnings.warn(message, mplDeprecation, stacklevel=1)
/anaconda3/lib/python3.6/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  warnings.warn(message, mplDeprecation, stacklevel=1)